/*
 * Copyright 2009 Nicolai Haehnle.
 * Copyright 2011 Tom Stellard <tstellar@gmail.com>
 * SPDX-License-Identifier: MIT
 */

#include "radeon_program_pair.h"

#include <stdio.h>

#include "util/glheader.h"
#include "util/ralloc.h"
#include "util/register_allocate.h"
#include "util/u_memory.h"

#include "r300_fragprog_swizzle.h"
#include "radeon_compiler.h"
#include "radeon_compiler_util.h"
#include "radeon_dataflow.h"
#include "radeon_list.h"
#include "radeon_regalloc.h"
#include "radeon_variable.h"

static void
scan_read_callback(void *data, struct rc_instruction *inst, rc_register_file file,
                   unsigned int index, unsigned int mask)
{
   struct regalloc_state *s = data;
   struct register_info *reg;
   unsigned int i;

   if (file != RC_FILE_INPUT)
      return;

   s->Input[index].Used = 1;
   reg = &s->Input[index];

   for (i = 0; i < 4; i++) {
      if (!((mask >> i) & 0x1)) {
         continue;
      }
      reg->Live[i].Used = 1;
      reg->Live[i].Start = 0;
      reg->Live[i].End = s->LoopEnd > inst->IP ? s->LoopEnd : inst->IP;
   }
}

static void
remap_register(void *data, struct rc_instruction *inst, rc_register_file *file, unsigned int *index)
{
   struct regalloc_state *s = data;
   const struct register_info *reg;

   if (*file == RC_FILE_TEMPORARY && s->Simple)
      reg = &s->Temporary[*index];
   else if (*file == RC_FILE_INPUT)
      reg = &s->Input[*index];
   else
      return;

   if (reg->Allocated) {
      *index = reg->Index;
   }
}

static void
alloc_input_simple(void *data, unsigned int input, unsigned int hwreg)
{
   struct regalloc_state *s = data;

   if (input >= s->NumInputs)
      return;

   s->Input[input].Allocated = 1;
   s->Input[input].File = RC_FILE_TEMPORARY;
   s->Input[input].Index = hwreg;
}

/* This functions offsets the temporary register indices by the number
 * of input registers, because input registers are actually temporaries and
 * should not occupy the same space.
 *
 * This pass is supposed to be used to maintain correct allocation of inputs
 * if the standard register allocation is disabled. */
static void
do_regalloc_inputs_only(struct regalloc_state *s)
{
   for (unsigned i = 0; i < s->NumTemporaries; i++) {
      s->Temporary[i].Allocated = 1;
      s->Temporary[i].File = RC_FILE_TEMPORARY;
      s->Temporary[i].Index = i + s->NumInputs;
   }
}

static unsigned int
is_derivative(rc_opcode op)
{
   return (op == RC_OPCODE_DDX || op == RC_OPCODE_DDY);
}

struct variable_get_class_cb_data {
   unsigned int *can_change_writemask;
   unsigned int conversion_swizzle;
   struct radeon_compiler *c;
};

static void
variable_get_class_read_cb(void *userdata, struct rc_instruction *inst,
                           struct rc_pair_instruction_arg *arg,
                           struct rc_pair_instruction_source *src)
{
   struct variable_get_class_cb_data *d = userdata;
   unsigned int new_swizzle = rc_adjust_channels(arg->Swizzle, d->conversion_swizzle);
   /* We can't just call r300_swizzle_is_native basic here, because it ignores the
    * extra requirements for presubtract. However, after pair translation we no longer
    * have the rc_src_register required for the native swizzle, so we have to
    * reconstruct it. */
   struct rc_src_register reg = {};
   reg.Swizzle = new_swizzle;
   reg.File = src->File;

   assert(inst->Type == RC_INSTRUCTION_PAIR);
   /* The opcode is unimportant, we can't have TEX here. */
   if (!d->c->SwizzleCaps->IsNative(RC_OPCODE_MAD, reg)) {
      *d->can_change_writemask = 0;
   }
}

static unsigned
variable_get_class(struct rc_variable *variable, const struct rc_class *classes)
{
   unsigned int i;
   unsigned int can_change_writemask = 1;
   unsigned int writemask = rc_variable_writemask_sum(variable);
   struct rc_list *readers = rc_variable_readers_union(variable);
   int class_index;

   if (!variable->C->is_r500) {
      struct rc_class c;
      struct rc_variable *var_ptr;
      /* The assumption here is that if an instruction has type
       * RC_INSTRUCTION_NORMAL then it is a TEX instruction.
       * r300 and r400 can't swizzle the result of a TEX lookup. */
      for (var_ptr = variable; var_ptr; var_ptr = var_ptr->Friend) {
         if (var_ptr->Inst->Type == RC_INSTRUCTION_NORMAL) {
            writemask = RC_MASK_XYZW;
         }
      }

      /* Check if it is possible to do swizzle packing for r300/r400
       * without creating non-native swizzles. */
      class_index = rc_find_class(classes, writemask, 3);
      if (class_index < 0) {
         goto error;
      }
      c = classes[class_index];
      if (c.WritemaskCount == 1) {
         goto done;
      }
      for (i = 0; i < c.WritemaskCount; i++) {
         struct rc_variable *var_ptr;
         for (var_ptr = variable; var_ptr; var_ptr = var_ptr->Friend) {
            int j;
            unsigned int conversion_swizzle =
               rc_make_conversion_swizzle(writemask, c.Writemasks[i]);
            struct variable_get_class_cb_data d;
            d.can_change_writemask = &can_change_writemask;
            d.conversion_swizzle = conversion_swizzle;
            d.c = variable->C;
            /* If we get this far var_ptr->Inst has to
             * be a pair instruction.  If variable or any
             * of its friends are normal instructions,
             * then the writemask will be set to RC_MASK_XYZW
             * and the function will return before it gets
             * here. */
            rc_pair_for_all_reads_arg(var_ptr->Inst, variable_get_class_read_cb, &d);

            for (j = 0; j < var_ptr->ReaderCount; j++) {
               unsigned int old_swizzle;
               unsigned int new_swizzle;
               struct rc_reader r = var_ptr->Readers[j];
               if (r.Inst->Type == RC_INSTRUCTION_PAIR) {
                  old_swizzle = r.U.P.Arg->Swizzle;
               } else {
                  /* Source operands of TEX
                   * instructions can't be
                   * swizzle on r300/r400 GPUs.
                   */
                  can_change_writemask = 0;
                  break;
               }
               new_swizzle = rc_rewrite_swizzle(old_swizzle, conversion_swizzle);
               if (!r300_swizzle_is_native_basic(new_swizzle)) {
                  can_change_writemask = 0;
                  break;
               }
            }
            if (!can_change_writemask) {
               break;
            }
         }
         if (!can_change_writemask) {
            break;
         }
      }
   }

   if (variable->Inst->Type == RC_INSTRUCTION_PAIR) {
      /* DDX/DDY seem to always fail when their writemasks are
       * changed.*/
      if (is_derivative(variable->Inst->U.P.RGB.Opcode) ||
          is_derivative(variable->Inst->U.P.Alpha.Opcode)) {
         can_change_writemask = 0;
      }
   }
   for (; readers; readers = readers->Next) {
      struct rc_reader *r = readers->Item;
      if (r->Inst->Type == RC_INSTRUCTION_PAIR) {
         if (r->U.P.Arg->Source == RC_PAIR_PRESUB_SRC) {
            can_change_writemask = 0;
            break;
         }
         /* DDX/DDY also fail when their swizzles are changed. */
         if (is_derivative(r->Inst->U.P.RGB.Opcode) || is_derivative(r->Inst->U.P.Alpha.Opcode)) {
            can_change_writemask = 0;
            break;
         }
      }
   }

   class_index = rc_find_class(classes, writemask, can_change_writemask ? 3 : 1);
done:
   if (class_index > -1) {
      return classes[class_index].ID;
   } else {
   error:
      rc_error(variable->C, "Could not find class for index=%u mask=%u\n", variable->Dst.Index,
               writemask);
      return 0;
   }
}

static void
do_advanced_regalloc(struct regalloc_state *s)
{

   unsigned int i, input_node, node_count, node_index;
   struct ra_class **node_classes;
   struct rc_instruction *inst;
   struct rc_list *var_ptr;
   struct rc_list *variables;
   struct ra_graph *graph;
   const struct rc_regalloc_state *ra_state = s->C->regalloc_state;

   /* Get list of program variables */
   variables = rc_get_variables(s->C);
   node_count = rc_list_count(variables);
   node_classes = memory_pool_malloc(&s->C->Pool, node_count * sizeof(struct ra_class *));

   for (var_ptr = variables, node_index = 0; var_ptr; var_ptr = var_ptr->Next, node_index++) {
      unsigned int class_index;
      /* Compute the live intervals */
      rc_variable_compute_live_intervals(var_ptr->Item);

      class_index = variable_get_class(var_ptr->Item, ra_state->class_list);
      node_classes[node_index] = ra_state->classes[class_index];
   }

   /* Calculate live intervals for input registers */
   for (inst = s->C->Program.Instructions.Next; inst != &s->C->Program.Instructions;
        inst = inst->Next) {
      rc_opcode op = rc_get_flow_control_inst(inst);
      if (op == RC_OPCODE_BGNLOOP) {
         struct rc_instruction *endloop = rc_match_bgnloop(inst);
         if (endloop->IP > s->LoopEnd) {
            s->LoopEnd = endloop->IP;
         }
      }
      rc_for_all_reads_mask(inst, scan_read_callback, s);
   }

   /* Compute the writemask for inputs. */
   for (i = 0; i < s->NumInputs; i++) {
      unsigned int chan, writemask = 0;
      for (chan = 0; chan < 4; chan++) {
         if (s->Input[i].Live[chan].Used) {
            writemask |= (1 << chan);
         }
      }
      s->Input[i].Writemask = writemask;
   }

   graph = ra_alloc_interference_graph(ra_state->regs, node_count + s->NumInputs);

   for (node_index = 0; node_index < node_count; node_index++) {
      ra_set_node_class(graph, node_index, node_classes[node_index]);
   }

   rc_build_interference_graph(graph, variables);

   /* Add input registers to the interference graph */
   for (i = 0, input_node = 0; i < s->NumInputs; i++) {
      if (!s->Input[i].Writemask) {
         continue;
      }
      for (var_ptr = variables, node_index = 0; var_ptr; var_ptr = var_ptr->Next, node_index++) {
         struct rc_variable *var = var_ptr->Item;
         if (rc_overlap_live_intervals_array(s->Input[i].Live, var->Live)) {
            ra_add_node_interference(graph, node_index, node_count + input_node);
         }
      }
      /* Manually allocate a register for this input */
      ra_set_node_reg(graph, node_count + input_node,
                      get_reg_id(s->Input[i].Index, s->Input[i].Writemask));
      input_node++;
   }

   if (!ra_allocate(graph)) {
      rc_error(s->C, "Ran out of hardware temporaries\n");
      ralloc_free(graph);
      return;
   }

   /* Rewrite the registers */
   for (var_ptr = variables, node_index = 0; var_ptr; var_ptr = var_ptr->Next, node_index++) {
      int reg = ra_get_node_reg(graph, node_index);
      unsigned int writemask = reg_get_writemask(reg);
      unsigned int index = reg_get_index(reg);
      struct rc_variable *var = var_ptr->Item;

      if (!s->C->is_r500 && var->Inst->Type == RC_INSTRUCTION_NORMAL) {
         writemask = rc_variable_writemask_sum(var);
      }

      if (var->Dst.File == RC_FILE_INPUT) {
         continue;
      }
      rc_variable_change_dst(var, index, writemask);
   }

   ralloc_free(graph);
}

/**
 * @param user This parameter should be a pointer to an integer value.  If this
 * integer value is zero, then a simple register allocator will be used that
 * only allocates space for input registers (\sa do_regalloc_inputs_only).  If
 * user is non-zero, then the regular register allocator will be used
 * (\sa do_regalloc).
 */
void
rc_pair_regalloc(struct radeon_compiler *cc, void *user)
{
   struct r300_fragment_program_compiler *c = (struct r300_fragment_program_compiler *)cc;
   struct regalloc_state s;
   int *do_full_regalloc = (int *)user;

   memset(&s, 0, sizeof(s));
   s.C = cc;
   s.NumInputs = rc_get_max_index(cc, RC_FILE_INPUT) + 1;
   s.Input = memory_pool_malloc(&cc->Pool, s.NumInputs * sizeof(struct register_info));
   memset(s.Input, 0, s.NumInputs * sizeof(struct register_info));

   s.NumTemporaries = rc_get_max_index(cc, RC_FILE_TEMPORARY) + 1;
   s.Temporary = memory_pool_malloc(&cc->Pool, s.NumTemporaries * sizeof(struct register_info));
   memset(s.Temporary, 0, s.NumTemporaries * sizeof(struct register_info));

   rc_recompute_ips(s.C);

   c->AllocateHwInputs(c, &alloc_input_simple, &s);
   if (*do_full_regalloc) {
      do_advanced_regalloc(&s);
   } else {
      s.Simple = 1;
      do_regalloc_inputs_only(&s);
   }

   /* Rewrite inputs and if we are doing the simple allocation, rewrite
    * temporaries too. */
   for (struct rc_instruction *inst = s.C->Program.Instructions.Next;
        inst != &s.C->Program.Instructions; inst = inst->Next) {
      rc_remap_registers(inst, &remap_register, &s);
   }
}
